1   /*
2    * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4    *
5    * This code is free software; you can redistribute it and/or modify it
6    * under the terms of the GNU General Public License version 2 only, as
7    * published by the Free Software Foundation.  Oracle designates this
8    * particular file as subject to the "Classpath" exception as provided
9    * by Oracle in the LICENSE file that accompanied this code.
10   *
11   * This code is distributed in the hope that it will be useful, but WITHOUT
12   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14   * version 2 for more details (a copy is included in the LICENSE file that
15   * accompanied this code).
16   *
17   * You should have received a copy of the GNU General Public License version
18   * 2 along with this work; if not, write to the Free Software Foundation,
19   * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20   *
21   * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22   * or visit www.oracle.com if you need additional information or have any
23   * questions.
24   */
25  
26  /**
27   * This is a tool to generate categoryNames and categoryMap which are used in
28   * CharSet.java.
29   */
30  
31  package build.tools.generatebreakiteratordata;
32  
33  import java.io.BufferedReader;
34  import java.io.BufferedWriter;
35  import java.io.FileReader;
36  import java.io.FileWriter;
37  import java.util.StringTokenizer;
38  
/**
 * Build-time tool that reads a Unicode data file (one character per line,
 * fields separated by ';') and produces, for every general category listed in
 * {@link #categoryNames}, a flat array of code point ranges stored as
 * consecutive (first, last) pairs.
 *
 * <p>All state is kept in static fields; the class is not designed for
 * concurrent use.
 */
class CharacterCategory {

    /**
     * A list of Unicode category names. The third field of every spec line
     * is matched against these names; the last entry is a dummy slot which
     * only receives the sentinel written before the first real character.
     */
    static final String[] categoryNames = {
        "Ll",        /* Letter, Lowercase */
        "Lu",        /* Letter, Uppercase */
        "Lt",        /* Letter, Titlecase */
        "Lo",        /* Letter, Other */
        "Lm",        /* Letter, Modifier */
        "Nd",        /* Number, Decimal Digit */
        "Nl",        /* Number, Letter */
        "No",        /* Number, Other */
        "Ps",        /* Punctuation, Open */
        "Pe",        /* Punctuation, Close */
        "Pi",        /* Punctuation, Initial quote */
        "Pf",        /* Punctuation, Final quote */
        "Pd",        /* Punctuation, Dash */
        "Pc",        /* Punctuation, Connector */
        "Po",        /* Punctuation, Other */
        "Sc",        /* Symbol, Currency */
        "Sm",        /* Symbol, Math */
        "So",        /* Symbol, Other */
        "Mn",        /* Mark, Non-Spacing */
        "Mc",        /* Mark, Spacing Combining */
        "Me",        /* Mark, Enclosing */
        "Zl",        /* Separator, Line */
        "Zp",        /* Separator, Paragraph */
        "Zs",        /* Separator, Space */
        "Cc",        /* Other, Control */
        "Cf",        /* Other, Format */
        "--",        /* Dummy, ignored */
        // Don't add anything after the Dummy entry!!
    };

    /**
     * An array of Unicode code points for each category (dummy entry
     * excluded). Each row holds (first, last) pairs of code point ranges.
     * It is null until {@link #makeCategoryMap(String)} has been called.
     */
    private static int[][] categoryMap;


    /**
     * Generates CategoryMap for GenerateBreakIteratorData.
     *
     * <p>May be called repeatedly; every call re-reads the file and replaces
     * the previously generated map.
     *
     * @param filename path of the Unicode data file to read; it replaces the
     *                 default spec file name
     */
    static void makeCategoryMap(String filename) {
        /* Overwrite specfile name */
        specfile = filename;

        /* Generate data in current format (1.5.0) */
        generateNewData();

        /* Copy generated data to categoryMap, trimmed to the used length */
        categoryMap = new int[categoryNames.length - 1][];
        for (int i = 0; i < categoryMap.length; i++) {
            int len = newListCount[BMP][i] + newListCount[nonBMP][i];
            categoryMap[i] = new int[len];
            System.arraycopy(newList[i], 0, categoryMap[i], 0, len);
        }
    }

    /**
     * Returns categoryMap for the given category.
     *
     * @param category index into {@link #categoryNames} (dummy entry excluded)
     * @return the (first, last) pairs generated for that category; the
     *         internal array is returned, not a copy
     */
    static int[] getCategoryMap(int category) {
        return categoryMap[category];
    }


    /**
     * Only used for debugging and generating a test program.
     *
     * @param args command-line options: {@code -spec}, {@code -old} and
     *             {@code -o}, each followed by a value
     */
    public static void main(String[] args) {
        /* Parses command-line options */
        processArgs(args);

        /* Generates data in current format (1.5.0) */
        generateNewData();

        /*
         * Generates data in older format (1.4.X and earlier) and creates
         * the old CategoryMap if an old data filename was given.
         */
        if (!oldDatafile.equals("")) {
            generateOldData();
            generateOldDatafile();
        }

        /* Displays summary of generated data */
        showSummary();

        /*
         * Generates a test program which compares the new data with the
         * return values of Character.getType().
         */
        generateTestProgram();
    }


    /**
     * Spec (Unicode data file)
     */
    private static String specfile = "UnicodeData.txt";

    /**
     * Output directory for the generated test program; empty means the
     * current working directory.
     */
    private static String outputDir = "";

    /**
     * Old data filename; empty means the old-format data is not generated.
     */
    private static String oldDatafile = "";

    /**
     * Set once the old-format data has been generated, so that the summary
     * only compares old and new data when both exist.
     */
    private static boolean oldDataGenerated = false;

    /**
     * Parses the specified arguments and sets up the variables. Prints a
     * usage message and exits with status 1 on an unknown option or on an
     * option that is missing its value.
     */
    private static void processArgs(String[] args) {
        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            boolean known = arg.equals("-spec") || arg.equals("-old") || arg.equals("-o");

            /* Every option takes exactly one value. */
            if (!known || i + 1 >= args.length) {
                System.err.println("Usage: java CharacterCategory [-spec specfile] [-old oldDatafile] [-o outputDir]");
                System.exit(1);
                return;
            }

            String value = args[++i];
            if (arg.equals("-spec")) {
                specfile = value;
            } else if (arg.equals("-old")) {
                oldDatafile = value;
            } else {
                outputDir = value;
            }
        }
    }

    /**
     * Returns the path of an output file with the given name, placed in the
     * output directory if one was specified.
     */
    private static String resolveOutputPath(String name) {
        if (outputDir.length() == 0) {
            return name;
        }
        return new java.io.File(outputDir, name).getPath();
    }


    /**
     * Displays summary of generated data. The old and the new data are only
     * checked against each other when the old data has been generated.
     */
    private static void showSummary() {
        int oldSum = 0;
        int newSum = 0;
        int oldSuppSum = 0;
        int newSuppSum = 0;

        for (int i = 0; i < categoryNames.length - 1; i++) {
            int newNum = newListCount[BMP][i] + newListCount[nonBMP][i];

            if (oldDataGenerated) {
                if (oldTotalCount[i] != newNum) {
                    System.err.println("Error: The number of generated data is different between the new approach and the old approach.");
                }
                if (oldListCount[SURROGATE][i] != newListCount[nonBMP][i]) {
                    System.err.println("Error: The number of generated supplementarycharacters is different between the new approach and the old approach.");
                }
            }

            System.out.println("    " + categoryNames[i] + ": " +
                               oldTotalCount[i] +
                               "(" + oldListCount[BEFORE][i] +
                               " + " + oldListCount[SURROGATE][i] +
                               " + " + oldListCount[AFTER][i] + ")" +
                               " --- " + newNum +
                               "(" + newListCount[BMP][i] +
                               " + " + newListCount[nonBMP][i] + ")");

            /* Old format: 2 bytes per BMP entry, 4 per surrogate pair. */
            oldSum += oldListCount[BEFORE][i] * 2 +
                      oldListCount[SURROGATE][i] * 4 +
                      oldListCount[AFTER][i] * 2;
            /* New format: every entry is a 4-byte int. */
            newSum += newNum * 4;
            oldSuppSum += oldListCount[SURROGATE][i] * 4;
            newSuppSum += newListCount[nonBMP][i] * 4;
        }

        System.out.println("\nTotal buffer sizes are:\n    " +
                           oldSum + "bytes(Including " + oldSuppSum +
                           "bytes for supplementary characters)\n    " +
                           newSum + "bytes(Including " + newSuppSum +
                           "bytes for supplementary characters)");

        if (oldDataGenerated && !ignoredOld.toString().equals(ignoredNew.toString())) {
            System.err.println("Ignored categories: Error: List mismatch: " +
                                ignoredOld + " vs. " + ignoredNew);
        } else {
            System.out.println("\nIgnored categories: " + ignoredNew);
            System.out.println("Please confirm that they aren't used in BreakIteratorRules.");
        }
    }


    private static final int HighSurrogate_CodeUnit_Start = 0xD800;
    private static final int LowSurrogate_CodeUnit_Start  = 0xDC00;
    private static final int Supplementary_CodePoint_Start    = 0x10000;


    /* State of the old-format (1.4.X) data, indexed by range and category. */
    private static StringBuffer ignoredOld = new StringBuffer();
    private static int[] oldTotalCount = new int[categoryNames.length];
    private static int[][] oldListCount = new int[3][categoryNames.length];
    private static int[][] oldListLen = new int[3][categoryNames.length];
    private static StringBuffer[][] oldList = new StringBuffer[3][categoryNames.length];

    private static final int BEFORE = 0;
    private static final int SURROGATE = 1;
    private static final int AFTER = 2;

    /**
     * Makes CategoryMap in older format which had been used by JDK 1.4.X and
     * earlier versions. All old-format state is reset first.
     */
    private static void generateOldData() {
        /* Initialize arrays. */
        ignoredOld.setLength(0);
        for (int i = 0; i < categoryNames.length; i++) {
            oldTotalCount[i] = 0;
            for (int j = BEFORE; j <= AFTER; j++) {
                oldListCount[j][i] = 0;
                oldList[j][i] = new StringBuffer();
                oldListLen[j][i] = 17;
            }
        }

        storeOldData();

        /* The dummy category must hold the initial sentinel and nothing else. */
        if (oldTotalCount[categoryNames.length - 1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
        oldDataGenerated = true;
    }

    /**
     * Returns the index of the given category in {@link #categoryNames}
     * (the dummy entry included), or -1 if it is not listed.
     */
    private static int findCategoryIndex(String category) {
        for (int index = 0; index < categoryNames.length; index++) {
            if (category.equals(categoryNames[index])) {
                return index;
            }
        }
        return -1;
    }

    /**
     * Adds the category, followed by a space, to the given list of ignored
     * categories unless the list already contains it.
     */
    private static void noteIgnoredCategory(StringBuffer ignored, String category) {
        if (ignored.indexOf(category) == -1) {
            ignored.append(category);
            ignored.append(' ');
        }
    }

    /**
     * Reads the spec file and records the range boundaries of every listed
     * category in the old format. Reports the problem and exits with
     * status 1 if the file cannot be read or parsed.
     */
    private static void storeOldData() {
        try {
            BufferedReader bin = new BufferedReader(new FileReader(specfile));
            try {
                String prevCode = "????";
                String line;
                int prevIndex = categoryNames.length - 1;
                int prevCodeValue = -1;
                boolean setFirst = false;

                while ((line = bin.readLine()) != null) {
                    if (line.length() == 0) {
                        continue;
                    }

                    StringTokenizer st = new StringTokenizer(line, ";");
                    String code = st.nextToken();

                    /* Skip comment lines. */
                    char c = code.charAt(0);
                    if (c == '#' || c == '/') {
                        continue;
                    }

                    int curCodeValue = Integer.parseInt(code, 16);
                    String characterName = st.nextToken();
                    String category = st.nextToken();

                    int index = findCategoryIndex(category);
                    if (index < 0) {
                        noteIgnoredCategory(ignoredOld, category);
                        continue;
                    }

                    if (prevIndex != index) {
                        /* Category changed: close the old run, open a new one. */
                        appendOldChar(prevIndex, prevCodeValue, prevCode);
                        appendOldChar(index, curCodeValue, code);
                        prevIndex = index;
                    } else if (prevCodeValue != curCodeValue - 1) {
                        if (setFirst && characterName.endsWith(" Last>")) {
                            /* "First>".."Last>" lines describe one run. */
                            setFirst = false;
                        } else {
                            appendOldChar(prevIndex, prevCodeValue, prevCode);
                            appendOldChar(index, curCodeValue, code);
                        }
                    }
                    prevCodeValue = curCodeValue;
                    prevCode = code;
                    if (characterName.endsWith(" First>")) {
                        setFirst = true;
                    }
                }
                /* Close the last run. */
                appendOldChar(prevIndex, prevCodeValue, prevCode);
            } finally {
                bin.close();
            }
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + specfile);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Appends one range boundary to the old-format string of the given
     * category, as Java string-literal source text.
     *
     * @param index category index
     * @param code  code point value (-1 for the initial sentinel)
     * @param s     the code as written in the spec file, used for the
     *              "\\uXXXX" escape of BMP characters
     */
    private static void appendOldChar(int index, int code, String s) {
        int range;
        if (code < HighSurrogate_CodeUnit_Start) {
            range = BEFORE;
        } else if (code < Supplementary_CodePoint_Start) {
            range = AFTER;
        } else {
            range = SURROGATE;
        }

        /* Break the generated literal into a new source line when it gets long. */
        if (oldListLen[range][index] > 64) {
            oldList[range][index].append("\"\n                + \"");
            oldListLen[range][index] = 19;
        }

        if (code == 0x22 || code == 0x5c) {
            /* '"' and '\' must be escaped inside a string literal. */
            oldList[range][index].append('\\');
            oldList[range][index].append((char)code);
            oldListLen[range][index] += 2;
        } else if (code > 0x20 && code < 0x7F) {
            oldList[range][index].append((char)code);
            oldListLen[range][index]++;
        } else {
            if (range == SURROGATE) {// Need to convert code point to code unit
                oldList[range][index].append(toCodeUnit(code));
                oldListLen[range][index] += 12;
            } else {
                oldList[range][index].append("\\u");
                oldList[range][index].append(s);
                oldListLen[range][index] += 6;
            }
        }
        oldListCount[range][index]++;
        oldTotalCount[index]++;
    }

    /**
     * Returns the given supplementary code point as the source text of two
     * "\\uXXXX" escapes (high surrogate followed by low surrogate).
     */
    private static String toCodeUnit(int i) {
        StringBuffer sb = new StringBuffer();
        sb.append("\\u");
        sb.append(Integer.toString((i - Supplementary_CodePoint_Start) / 0x400 + HighSurrogate_CodeUnit_Start, 16).toUpperCase());
        sb.append("\\u");
        /* 0x10000 is a multiple of 0x400, so no offset is needed here. */
        sb.append(Integer.toString(i % 0x400 + LowSurrogate_CodeUnit_Start, 16).toUpperCase());
        return sb.toString();
    }

    /**
     * Returns the code point held by a one-character string or by a string
     * consisting of exactly one surrogate pair, or -1 if a leading high
     * surrogate is not followed by exactly one low surrogate. Not called
     * from within this class at present.
     */
    private static int toCodePoint(String s) {
        char c1 = s.charAt(0);

        if (s.length() == 1 || !Character.isHighSurrogate(c1)) {
            return (int)c1;
        } else {
            char c2 = s.charAt(1);
            if (s.length() != 2 || !Character.isLowSurrogate(c2)) {
                return -1;
            }
            return Character.toCodePoint(c1, c2);
        }
    }


    /* State of the new-format (1.5.0) data. */
    private static StringBuffer ignoredNew = new StringBuffer();
    private static int[] newTotalCount = new int[categoryNames.length];
    private static int[][] newListCount = new int[2][categoryNames.length];
    private static int[][] newList = new int[categoryNames.length][];

    private static final int BMP = 0;
    private static final int nonBMP = 1;

    /**
     * Makes CategoryMap in newer format which is used by JDK 1.5.0.
     * All new-format state is reset first, so the method can be run more
     * than once.
     */
    private static void generateNewData() {
        /* Initialize arrays and counters. */
        ignoredNew.setLength(0);
        for (int i = 0; i < categoryNames.length; i++) {
            newList[i] = new int[10];
            newTotalCount[i] = 0;
            newListCount[BMP][i] = 0;
            newListCount[nonBMP][i] = 0;
        }

        storeNewData();

        /* The dummy category must hold the initial sentinel and nothing else. */
        if (newListCount[BMP][categoryNames.length - 1] != 1) {
            System.err.println("This should not happen. Unicode data which belongs to an undefined category exists");
            System.exit(1);
        }
    }

    /**
     * Reads the spec file and records the range boundaries of every listed
     * category in the new format. Reports the problem and exits with
     * status 1 if the file cannot be read or parsed.
     */
    private static void storeNewData() {
        try {
            BufferedReader bin = new BufferedReader(new FileReader(specfile));
            try {
                String line;
                int prevIndex = categoryNames.length - 1;
                int prevCodeValue = -1;
                boolean setFirst = false;

                while ((line = bin.readLine()) != null) {
                    if (line.length() == 0) {
                        continue;
                    }

                    StringTokenizer st = new StringTokenizer(line, ";");
                    String code = st.nextToken();

                    /* Skip comment lines. */
                    char c = code.charAt(0);
                    if (c == '#' || c == '/') {
                        continue;
                    }

                    int curCodeValue = Integer.parseInt(code, 16);
                    String characterName = st.nextToken();
                    String category = st.nextToken();

                    int index = findCategoryIndex(category);
                    if (index < 0) {
                        noteIgnoredCategory(ignoredNew, category);
                        continue;
                    }

                    if (prevIndex == index) {
                        if (setFirst) {
                            /* A "First>" line must be followed by its "Last>". */
                            if (characterName.endsWith(" Last>")) {
                                setFirst = false;
                            } else {
                                System.err.println("*** Error 1 at " + code);
                            }
                        } else {
                            if (characterName.endsWith(" First>")) {
                                setFirst = true;
                            } else if (characterName.endsWith(" Last>")) {
                                System.err.println("*** Error 2 at " + code);
                            } else {
                                /* A gap ends the current run and starts a new one. */
                                if (prevCodeValue != curCodeValue - 1) {
                                    appendNewChar(prevIndex, prevCodeValue);
                                    appendNewChar(index, curCodeValue);
                                }
                            }
                        }
                    } else {
                        if (setFirst) {
                            System.err.println("*** Error 3 at " + code);
                        } else if (characterName.endsWith(" First>")) {
                            setFirst = true;
                        } else if (characterName.endsWith(" Last>")) {
                            System.err.println("*** Error 4 at " + code);
                        }
                        /* Category changed: close the old run, open a new one. */
                        appendNewChar(prevIndex, prevCodeValue);
                        appendNewChar(index, curCodeValue);
                        prevIndex = index;
                    }
                    prevCodeValue = curCodeValue;
                }
                /* Close the last run. */
                appendNewChar(prevIndex, prevCodeValue);
            } finally {
                bin.close();
            }
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + specfile);
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Appends one range boundary to the new-format list of the given
     * category, growing the list when it is full.
     *
     * @param index category index
     * @param code  code point value (-1 for the initial sentinel)
     */
    private static void appendNewChar(int index, int code) {
        int bufLen = newList[index].length;
        if (newTotalCount[index] == bufLen) {
            int[] tmpBuf = new int[bufLen * 2];
            System.arraycopy(newList[index], 0, tmpBuf, 0, bufLen);
            newList[index] = tmpBuf;
        }

        newList[index][newTotalCount[index]++] = code;
        if (code < Supplementary_CodePoint_Start) {
            newListCount[BMP][index]++;
        } else {
            newListCount[nonBMP][index]++;
        }
    }


    /**
     * Generates the old CategoryMap and writes it to the old data file.
     * Reports the problem and exits with status 1 if writing fails.
     */
    private static void generateOldDatafile() {
        try {
            BufferedWriter bout = new BufferedWriter(new FileWriter(oldDatafile));
            try {
                bout.write("\n    //\n    // The following String[][] can be used in CharSet.java as is.\n    //\n\n    private static final String[][] categoryMap = {\n");
                for (int i = 0; i < categoryNames.length - 1; i++) {
                    if (oldTotalCount[i] == 0) {
                        continue;
                    }
                    bout.write("        { \"" + categoryNames[i] + "\",");

                    /* 0x0000-0xD7FF */
                    if (oldListCount[BEFORE][i] != 0) {
                        bout.write(" \"");

                        bout.write(oldList[BEFORE][i].toString() + "\"\n");
                    }

                    /* 0xD800-0xFFFF */
                    if (oldListCount[AFTER][i] != 0) {
                        if (oldListCount[BEFORE][i] != 0) {
                            bout.write("                + \"");
                        } else {
                            bout.write(" \"");
                        }
                        bout.write(oldList[AFTER][i].toString() + "\"\n");
                    }

                    /* 0xD800 0xDC00(0x10000)-0xDBFF 0xDFFF(0x10FFFF) */
                    if (oldListCount[SURROGATE][i] != 0) {
                        if (oldListCount[BEFORE][i] != 0 || oldListCount[AFTER][i] != 0) {
                            bout.write("                + \"");
                        } else {
                            bout.write(" \"");
                        }
                        bout.write(oldList[SURROGATE][i].toString() + "\"\n");
                    }
                    bout.write("        },\n");
                }
                bout.write("    };\n\n");
            } finally {
                bout.close();
            }
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + oldDatafile);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + oldDatafile + " has been generated.");
    }


    /**
     * Test program to be generated
     */
    private static final String outfile = "CharacterCategoryTest.java";

    /**
     * Generates a test program which compares the generated data (newer one)
     * with the return values of Character.getType(). The file is written
     * into the output directory if one was specified. Reports the problem
     * and exits with status 1 if writing fails.
     */
    private static void generateTestProgram() {
        String path = resolveOutputPath(outfile);

        try {
            BufferedWriter bout = new BufferedWriter(new FileWriter(path));
            try {
                bout.write(collationMethod);
                bout.write("\n    //\n    // The following arrays can be used in CharSet.java as is.\n    //\n\n");

                bout.write("    private static final String[] categoryNames = {");
                for (int i = 0; i < categoryNames.length - 1; i++) {
                    if (i % 10 == 0) {
                        bout.write("\n        ");
                    }
                    bout.write("\"" + categoryNames[i] + "\", ");
                }
                bout.write("\n    };\n\n");

                bout.write("    private static final int[][] categoryMap = {\n");

                for (int i = 0; i < categoryNames.length - 1; i++) {
                    StringBuffer sb = new StringBuffer("        { /*  Data for \"" + categoryNames[i] + "\" category */");

                    for (int j = 0; j < newTotalCount[i]; j++) {
                        if (j % 8 == 0) {
                            sb.append("\n        ");
                        }
                        sb.append(" 0x");
                        sb.append(Integer.toString(newList[i][j], 16).toUpperCase());
                        sb.append(',');
                    }
                    sb.append("\n        },\n");
                    bout.write(sb.toString());
                }

                bout.write("    };\n");

                /* Closes the class opened by collationMethod. */
                bout.write("\n}\n");
            } finally {
                bout.close();
            }
        }
        catch (Exception e) {
            System.err.println("Error occurred on accessing " + path);
            e.printStackTrace();
            System.exit(1);
        }

        System.out.println("\n" + path + " has been generated.");
    }

    /**
     * First part of the generated test program: the class header and its
     * main method. The generated arrays and the closing brace are appended
     * by generateTestProgram().
     */
    static String collationMethod =
"public class CharacterCategoryTest {\n\n" +
"    static final int SIZE = 0x110000;\n" +
"    static final String[] category = {\n" +
"       \"Cn\", \"Lu\", \"Ll\", \"Lt\", \"Lm\", \"Lo\", \"Mn\", \"Me\",\n" +
"       \"Mc\", \"Nd\", \"Nl\", \"No\", \"Zs\", \"Zl\", \"Zp\", \"Cc\",\n" +
"       \"Cf\", \"\",   \"Co\", \"Cs\", \"Pd\", \"Ps\", \"Pe\", \"Pc\",\n" +
"       \"Po\", \"Sm\", \"Sc\", \"Sk\", \"So\", \"Pi\", \"Pf\"\n" +
"    };\n\n" +
"    public static void main(String[] args) {\n" +
"        boolean err = false;\n" +
"        byte[] b = new byte[SIZE];\n" +
"        for (int i = 0; i < SIZE; i++) {\n" +
"            b[i] = 0;\n" +
"        }\n" +
"        for (int i = 0; i < categoryMap.length; i++) {\n" +
"            byte categoryNum = 0;\n" +
"            String categoryName = categoryNames[i];\n" +
"            for (int j = 0; j < category.length; j++) {\n" +
"                if (categoryName.equals(category[j])) {\n" +
"                    categoryNum = (byte)j;\n" +
"                    break;\n" +
"                }\n" +
"            }\n" +
"            int[] values = categoryMap[i];\n" +
"            for (int j = 0; j < values.length;) {\n" +
"                int firstChar = values[j++];\n" +
"                int lastChar = values[j++];\n" +
"                for (int k = firstChar; k <= lastChar; k++) {\n" +
"                    b[k] = categoryNum;\n" +
"                }\n" +
"            }\n" +
"        }\n" +
"        for (int i = 0; i < SIZE; i++) {\n" +
"            int characterType = Character.getType(i);\n" +
"            if (b[i] != characterType) {\n" +
"                /* Co, Cs and Sk categories are ignored in CharacterCategory. */\n" +
"                if (characterType == Character.PRIVATE_USE ||\n" +
"                    characterType == Character.SURROGATE ||\n" +
"                    characterType == Character.MODIFIER_SYMBOL) {\n" +
"                    continue;\n" +
"                }\n" +
"                err = true;\n" +
"                System.err.println(\"Category conflict for a character(0x\" +\n" +
"                                   Integer.toHexString(i) +\n" +
"                                   \"). CharSet.categoryMap:\" +\n" +
"                                   category[b[i]] +\n" +
"                                   \"  Character.getType():\" +\n" +
"                                   category[characterType]);\n" +
"            }\n" +
"        }\n\n" +
"        if (err) {\n" +
"            throw new RuntimeException(\"Conflict occurred between Charset.categoryMap and Character.getType()\");\n" +
"        }\n" +
"    }\n";

}